
library(dplyr)

# Pretest data -------------------
# Completion codes belonging to low-quality pretest responses
low_quality_codes <- c(30437, 71572, 50805, 87395, 72736, 67291, 71312, 49548,
                       60716, 85517, 82230, 94554, 11694, 36338, 97550)

p_df <- read.csv("./raw_data/pretest_data.csv", stringsAsFactors = FALSE) %>%
  filter(
    # The first two rows of the export are metadata, not responses
    row_number() > 2,
    # Keep a row when its completion code is not flagged, or when it is the
    # one ResponseId listed here.
    # NOTE(review): as written, this ResponseId is retained even if its
    # completion code is flagged -- confirm that exception is intended.
    !(completioncode %in% low_quality_codes) | ResponseId == "R_1gcm3PSKIXOnMrD"
  )

# Save the cleaned pretest data for future use
write.csv(p_df, "./clean_data/clean_pretest_data.csv")


# Full data ---------------------------------
# Read in data
# Read the raw survey export; the string "-99" is treated as missing.
tempdat <- read.csv("./raw_data/survey_data.csv", na.strings = "-99")

# First two rows are metadata. Negative indexing (instead of
# 3:nrow(tempdat)) also behaves correctly when fewer than three rows exist,
# where 3:nrow(tempdat) would count backwards and select the wrong rows.
df <- tempdat[-(1:2), , drop = FALSE]

# Drop disqualified respondents
df <- df %>%
  # Age is converted explicitly before the comparison: the script converts it
  # with as.numeric() when building covariates, and comparing a text column
  # against 18 would be a lexical comparison (e.g. "100" sorts before "18").
  filter(Q39 == "I agree to participate in this study." &
           as.numeric(Age) >= 18 & Q37 == "Yes" & Q38 == "Yes" & Q61 == "Yes") %>%
  # Keep only those that finish
  filter(Finished == "True") %>%
  # Note: here, missing treatment status indicates an incomplete survey
  mutate(treated = ifelse(Group == "", NA, as.integer(Group == "Treatment"))) %>%
  # Drop respondents without treatment status
  filter(!is.na(treated))

# Create quality flag 
# Survey duration in seconds, and the arm-specific speeding cutoffs
# (one third of the median duration within each experimental arm)
duration_sec <- as.numeric(df$Duration..in.seconds.)
cutoff_treated <- 0.33 * median(duration_sec[df$treated == 1], na.rm = TRUE)
cutoff_control <- 0.33 * median(duration_sec[df$treated == 0], na.rm = TRUE)

# Flag low reCAPTCHA scores, speeders, and failed attention checks
df$bot <- as.integer(as.numeric(df$Q_RecaptchaScore) < 0.5)
df$speeder <- as.integer(
  duration_sec < ifelse(df$treated == 1, cutoff_treated, cutoff_control)
)
df$fail_att_check <- as.integer(
  df$Attention.Check.2 != "Yes, I have devoted full attention to the questions so far and I think you should use my responses for your study."
)

# Tabulate speeders, alone and against the attention check
table(df$speeder)
table(df$speeder, df$fail_att_check)

# Keep only non-speeders (rows with a missing speeder flag are dropped too)
df <- filter(df, speeder == 0)

# Function to recode likert values
# Note: rev==TRUE indicates that greater item agreement implies lower implicit variable value
#
# Arguments:
#   x   - one agreement label or a vector of them ("Strongly agree",
#         "Somewhat agree", "Neither agree nor disagree", "Somewhat disagree",
#         "Strongly disagree").
#   rev - single logical; FALSE scores "Strongly agree" as 5, anything else
#         scores it as 1.
# Returns a numeric vector the same length as x; missing or unrecognised
# labels become NA. Works element-wise, so it accepts whole columns as well
# as single values (and zero-length input).
likert_recode <- function(x, rev) {
  # Labels ordered from lowest (1) to highest (5) agreement
  agreement_levels <- c("Strongly disagree",
                        "Somewhat disagree",
                        "Neither agree nor disagree",
                        "Somewhat agree",
                        "Strongly agree")
  # match() gives the 1-5 position of each label, or NA when there is none
  score <- as.numeric(match(x, agreement_levels))
  if (rev == FALSE) {
    score
  } else {
    # Mirror the scale: 5 <-> 1, 4 <-> 2, 3 stays 3
    6 - score
  }
}

# ----- Create pre-treatment covariates

# Re-code RR items and create index 
# Recode one column of Likert responses by calling likert_recode() on each
# response. vapply() always returns a plain numeric vector of the same length
# as the input (including length zero), so this also works on an empty frame,
# where a 1:nrow(df) loop would not.
recode_likert_column <- function(responses, rev) {
  vapply(responses, likert_recode, numeric(1), rev = rev, USE.NAMES = FALSE)
}

# Items 2 and 3 are reverse-coded
df$rr1 <- recode_likert_column(df$Racial.Resentment_1, rev = FALSE)
df$rr2 <- recode_likert_column(df$Racial.Resentment_2, rev = TRUE)
df$rr3 <- recode_likert_column(df$Racial.Resentment_3, rev = TRUE)
df$rr4 <- recode_likert_column(df$Racial.Resentment_4, rev = FALSE)

# Index = respondent-level mean of the answered items (NaN when all four
# items are missing, as with mean(..., na.rm = TRUE))
df$rr <- rowMeans(cbind(df$rr1, df$rr2, df$rr3, df$rr4), na.rm = TRUE)

# Other covariates 
# Build the pre-treatment covariates, manipulation-check variables and
# open-ended treatment responses from the raw survey columns.
df <- df %>%
  mutate(state = Q51,
         # Region of residence; states not listed below get NA
         region = case_when(state %in% c("Connecticut", "Maine", "Massachusetts", "New Hampshire",
                                         "Rhode Island", "Vermont", "New Jersey", "New York",
                                         "Pennsylvania") ~ "Northeast",
                            state %in% c("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin",
                                         "Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", 
                                         "North Dakota", "South Dakota") ~ "Midwest",
                            state %in% c("Delaware", "Florida", "Georgia", "Maryland",
                                         "North Carolina", "South Carolina", "Virginia",
                                         "Washington, D.C.", "West Virginia", "Alabama", 
                                         "Kentucky", "Mississippi", "Tennessee", "Arkansas", 
                                         "Louisiana", "Oklahoma", "Texas") ~ "South",
                            state %in% c("Arizona", "Colorado", "Idaho", "Montana", "Nevada", 
                                         "New Mexico", "Utah", "Wyoming", "Alaska", "California", 
                                         "Hawaii", "Oregon", "Washington") ~ "West"
                            ),
         age = as.numeric(Age),
         # case_when() takes the first matching condition, so each upper bound
         # alone defines a bracket. The bounds are 35/45/55/65 so that ages
         # 34, 44, 54 and 64 fall inside their labelled bracket instead of
         # being left unclassified (NA).
         age_cat = case_when(age < 25 ~ "Under 25",
                             age < 35 ~ "25 to 34",
                             age < 45 ~ "35 to 44",
                             age < 55 ~ "45 to 54",
                             age < 65 ~ "55 to 64",
                             age >= 65 ~ "65+"),
         man = as.integer(Gender=="Man"),
         # Any comma in Race is treated as a multiple selection
         race = ifelse(grepl(pattern=",", x=Race), "Multi-racial",
                              ifelse(Race=="Other (please specify):", "Other", Race)),
         # Collapse to three categories: White, Black, Other
         race = ifelse(race == "White", "White", 
                       ifelse(race == "Black or African American", "Black", "Other")),
         race_id_import = case_when(Q48=="" ~ NA,
                                    Q48=="Not at all important" ~ 1,
                                    Q48=="Slightly important" ~ 2,
                                    Q48=="Moderately important" ~ 3,
                                    Q48=="Very important" ~ 4,
                                    Q48=="Extremely important" ~ 5),
         left_right = as.numeric(Q52_1),
         pol_interest = as.numeric(Political.Interest_1),
         # Missing knowledge answers are recoded to "DK" so they score 0 below
         Senate.knowledge = ifelse(is.na(Senate.knowledge), "DK", Senate.knowledge),
         House.knowledge = ifelse(is.na(House.knowledge), "DK", House.knowledge),
         pol_know1 = as.integer(Senate.knowledge=="Democratic Party"),
         pol_know2 = as.integer(House.knowledge=="Republican Party"),
         # Share of the two knowledge items scored 1
         pol_know = (pol_know1 + pol_know2)/2,
         party_id = case_when(Partisanship.A == "A Democrat" ~ "Democrat",
                              Partisanship.A == "A Republican" ~ "Republican",
                              Partisanship.A %in% c("An Independent", "Other (please specify)")
                              ~ "Independent"),
         usa_pride = as.numeric(American.Identity_1),
         rr_cat = case_when(rr <= 2 ~ "Low",
                            rr > 2 & rr < 4 ~ "Medium", 
                            rr >= 4 ~ "High"),
         attention = as.integer(Attention.Check.2 == "Yes, I have devoted full attention to the questions so far and I think you should use my responses for your study."),
         educ = case_when(Education == "Less than high school degree" ~ 1,
                          Education == "High school degree or GED" ~ 2,
                          Education == "Technical or community college diploma" ~ 3,
                          Education == "Bachelor's degree (e.g. BA, AB, BS)" ~ 4,
                          Education == "Master's or professional school degree (e.g. MA, MS, MEng, MEd, MSW, MBA, MD, DDS, DVM, LLB, JD)" ~ 5,
                          Education == "Doctorate degree (e.g. PhD, EdD)" ~ 6),
         # Dollar value assigned to each income bracket.
         # NOTE(review): 120000 is not the midpoint of $110,001-$150,000
         # (130000), unlike the other closed brackets -- confirm it is intended.
         # NOTE(review): no bracket between $200,000 and $250,000 is mapped --
         # confirm against the survey's answer options.
         hh_income_cat = case_when(Q50 == "$1 to $30,000" ~ 15000,
                                   Q50 == "$30,001 to $60,000" ~ 45000,
                                   Q50 == "$60,001 to $90,000" ~ 75000,
                                   Q50 == "$90,001 to $110,000" ~ 100000,
                                   Q50 == "$110,001 to $150,000" ~ 120000,
                                   Q50 == "$150,001 to $200,000" ~ 175000,
                                   Q50 == "More than $250,000" ~ 250000),
         # Use the numeric answer in Q49 when present, else the bracket value
         hh_income = ifelse(is.na(Q49) | Q49=="", hh_income_cat, as.numeric(Q49)),
         # Sum of the four page-submit timings
         treatment_time = as.numeric(Q54_Page.Submit) + as.numeric(Q67_Page.Submit) + 
           as.numeric(Q68_Page.Submit) + as.numeric(Q69_Page.Submit),
         # Removal estimates are read from a different column in each arm
         est_removals_ctrl = ifelse(Q59=="", NA, as.numeric(Q59)),
         est_removals_trt = ifelse(Treatment.attention.=="", NA, as.numeric(Treatment.attention.)),
         est_removals = ifelse(treated==0, est_removals_ctrl, est_removals_trt),
         # Fact check: 1 when the answer is exactly "200"; blank or missing
         # answers count as 0
         statue_factcheck_ctrl = ifelse(treated==1, NA,
                                        ifelse(is.na(Q59)|Q59=="", 0,
                                               as.integer(Q59=="200"))),
         statue_factcheck_trt = ifelse(treated==0, NA,
                                       ifelse(is.na(Treatment.attention.)|Treatment.attention.=="", 0,
                                       as.integer(Treatment.attention.=="200"))),
         statue_factcheck = ifelse(treated==0, statue_factcheck_ctrl, statue_factcheck_trt),
         treatment_openended = Treatment.open.ended,
         # Set to NA for the control group
         treatment_surprise = ifelse(treated==0, NA, Treatment.newness)
         ) 

# Standardize continuous variables 
# Add a full-sample z-score column (<name>_z) for each continuous covariate
continuous_covariates <- c("age", "race_id_import", "left_right",
                           "pol_interest", "pol_know", "usa_pride",
                           "rr", "educ", "hh_income")

for (covariate in continuous_covariates) {
  raw_values <- df[[covariate]]
  centered <- raw_values - mean(raw_values, na.rm = TRUE)
  df[[paste0(covariate, "_z")]] <- centered / sd(raw_values, na.rm = TRUE)
}

# Create shorthand variables for HTE analysis
# Indicator for living in the South
df$south <- as.integer(df$region == "South")

# Indicator for political interest at or above the sample median
median_pol_interest <- median(df$pol_interest, na.rm = TRUE)
df$hi_pol_interest <- as.integer(df$pol_interest >= median_pol_interest)

# ----- Generate outcome variables 
# Map each outcome name to the raw policy-support column it is read from
policy_items <- c(
  symbol1 = "symbolic.policy1_1",          # National lynching memorial
  symbol2 = "symbolic.policy1_2",          # Juneteenth celebration
  symbol3 = "symbolic.policy1_3",          # History museum field trips
  redist1 = "symbolic.policy1_4",          # Expand DEI hiring initiatives
  redist2 = "symbolic.policy1_5",          # Zoning rules to desegregate
  redist3 = "symbolic.policy1_6",          # Reallocate police budget
  backlash1 = "symbolic.policy1_7",        # Prevent renamings of Confederate places
  backlash2 = "symbolic.policy1_8",        # Revoke housing anti-discrimination rules
  backlash3 = "symbolic.policy1_9",        # Increase welfare work requirements
  placebo_policy1 = "symbolic.policy1_10"  # Require election official residency
)

# Convert each raw item to numeric under its outcome name
for (outcome in names(policy_items)) {
  df[[outcome]] <- as.numeric(df[[policy_items[[outcome]]]])
}

# Calculate item-level z-scores 
# Standardize every policy item against the control group: subtract the
# control-group mean and divide by the control-group SD. Each column is
# handled in one vectorised step, so the control mean and SD are computed
# once per item rather than once per respondent.
policy_outcomes <- c("symbol1", "symbol2", "symbol3",
                     "redist1", "redist2", "redist3",
                     "backlash1", "backlash2", "backlash3",
                     "placebo_policy1")
in_control <- df$treated == 0

for (outcome in policy_outcomes) {
  control_values <- df[[outcome]][in_control]
  df[[paste0(outcome, "_z")]] <-
    (df[[outcome]] - mean(control_values, na.rm = TRUE)) /
    sd(control_values, na.rm = TRUE)
}

# Take respondent-level average z-scores within each policy category 
# Row-wise mean over the supplied item columns, ignoring missing items.
# A respondent with no answered item gets NA (rowMeans() alone gives NaN).
row_mean_available <- function(...) {
  averages <- rowMeans(cbind(...), na.rm = TRUE)
  averages[is.nan(averages)] <- NA
  averages
}

# The original calls passed `na.m = TRUE`, which mean() silently ignores, so a
# single missing item made the whole index NA. Missing items are now skipped,
# as `na.rm = TRUE` was evidently meant to do.

# Indices on the raw response scale
df$symbol_idx_n <- row_mean_available(df$symbol1, df$symbol2, df$symbol3)
df$redist_idx_n <- row_mean_available(df$redist1, df$redist2, df$redist3)
df$backlash_idx_n <- row_mean_available(df$backlash1, df$backlash2, df$backlash3)

# Indices on the z-score scale
df$symbol_idx <- row_mean_available(df$symbol1_z, df$symbol2_z, df$symbol3_z)
df$redist_idx <- row_mean_available(df$redist1_z, df$redist2_z, df$redist3_z)
df$backlash_idx <- row_mean_available(df$backlash1_z, df$backlash2_z, df$backlash3_z)

# Re-calculate z-scores on indices 
# Re-standardize each index against the control group (control mean and SD),
# one whole column at a time instead of recomputing both statistics for every
# respondent.
policy_indices <- c("symbol_idx", "redist_idx", "backlash_idx")
control_rows <- df$treated == 0

for (index_name in policy_indices) {
  control_index <- df[[index_name]][control_rows]
  df[[paste0(index_name, "_z")]] <-
    (df[[index_name]] - mean(control_index, na.rm = TRUE)) /
    sd(control_index, na.rm = TRUE)
}

# Create outcome for single-preference item 
# Answer options of the forced-choice item and the short label for each
forced_choice_text <- c(
  "Symbolic acts, like removing Confederate statues and flags, to confront our difficult history.",
  "Redistributive reforms, like expanding affirmative action, to improve opportunities for underrepresented minorities.",
  "Neither of the above."
)
forced_choice_label <- c("Symbolic", "Redistributive", "Neither")

# Short label for the chosen option; any other value (or missing) becomes NA
df$policy_pref_forced <-
  forced_choice_label[match(df$constrained.policy, forced_choice_text)]

# Numeric version: Neither = 1, Symbolic = 2, Redistributive = 3
df$policy_pref_forced_num <-
  c(1, 2, 3)[match(df$policy_pref_forced,
                   c("Neither", "Symbolic", "Redistributive"))]

# ----- Create mechanism checks 
# Recode one column of Likert responses by calling likert_recode() on each
# response. vapply() always returns a plain numeric vector of the same length
# as the input (including length zero), so this also works on an empty frame,
# where a 1:nrow(df) loop would not.
recode_likert_column <- function(responses, rev) {
  vapply(responses, likert_recode, numeric(1), rev = rev, USE.NAMES = FALSE)
}

# Only the second item is reverse-coded
df$mech_priority_self <- recode_likert_column(df$Crowding.in_1, rev = FALSE)
df$mech_priority_others <- recode_likert_column(df$Crowding.in_2, rev = TRUE)
df$mech_opportunity <- recode_likert_column(df$Crowding.in_3, rev = FALSE)
df$mech_reputation <- recode_likert_column(df$Crowding.in_4, rev = FALSE)
df$mech_identity_threat <- recode_likert_column(df$Crowding.in_5, rev = FALSE)
df$mech_symbols_distract <- recode_likert_column(df$Crowding.in_6, rev = FALSE)
df$mech_gov_intervention <- recode_likert_column(df$Crowding.in_7, rev = FALSE)

# Score the racial-justice progress item: 3 = too much done, 2 = enough done,
# 1 = a lot more to do; any other value (or missing) becomes NA
progress_statements <- c(
  "Too much has been done on racial justice.",
  "Enough has been done on racial justice.",
  "There is still a lot more to do on racial justice."
)
df$mech_progress <- c(3, 2, 1)[match(df$Crowding.out, progress_statements)]

# Standardize mechanism variables 
# Add a full-sample z-score column (<name>_z) for each mechanism variable
mechanism_vars <- c("mech_priority_self", "mech_priority_others",
                    "mech_opportunity", "mech_reputation",
                    "mech_identity_threat", "mech_symbols_distract",
                    "mech_gov_intervention", "mech_progress")

for (mechanism in mechanism_vars) {
  responses <- df[[mechanism]]
  deviation <- responses - mean(responses, na.rm = TRUE)
  df[[paste0(mechanism, "_z")]] <- deviation / sd(responses, na.rm = TRUE)
}


# Read in hand-coded text data 
# Read the hand-coded open-ended responses, dropping rows whose first code is
# "1" or "1c".
hc <- read.csv("./raw_data/openended_handcode.csv") %>%
  filter(!code1 %in% c("1", "1c")) %>%
  # Indicators for whether the respondent reacted with opposition (a "3" in
  # any of the three code columns) or support (a "4" in any of them).
  # pmax() takes the element-wise maximum across the three columns, so no
  # rowwise() grouping is needed.
  mutate(txt_oppose = pmax(as.integer(grepl("3", x = code1)),
                           as.integer(grepl("3", x = code2)),
                           as.integer(grepl("3", x = code3))),
         txt_support = pmax(as.integer(grepl("4", x = code1)),
                            as.integer(grepl("4", x = code2)),
                            as.integer(grepl("4", x = code3)))) %>%
  select(ResponseId, txt_support, txt_oppose)

# Attach the text indicators to the survey data. The key is named explicitly
# so the join cannot silently pick up additional shared columns.
df <- df %>% 
  left_join(hc, by = "ResponseId")

# Save cleaned data 
write.csv(df, "./clean_data/clean_survey_data.csv", row.names = FALSE)
# Clear every object from the workspace once the cleaned file is written
rm(list=ls())

